# Updated by Wei on 2014/02/22 night at school

from __future__ import division
from operator import itemgetter, attrgetter

import gc
import math
import matplotlib
import os
import pylab
import random
import sys
import time
from sets import Set
from scipy import stats
import numpy as np

# Occurrence count per posting key; filled further down from the first
# space-separated field of every data line of the inputFileName2 file.
top10PostingPoppedDict = dict()

# Running count of document results whose accumulated score reaches the
# per-query lower-bound threshold; incremented by the final loop of the script.
numOfDocumentResultRetained = 0

# pangolin:
# WRONG data file:
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140222Night_DEBUG_OR_1%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140222Night_DEBUG_OR_3%"
# dodo:
# weight == 15
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_15_20140225Morning_DEBUG_OR_1%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_15_20140225Morning_DEBUG_OR_3%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_15_20140225Morning_DEBUG_OR_5%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_15_20140225Morning_DEBUG_OR_10%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_15_20140225Morning_DEBUG_OR_15%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_15_20140225Morning_DEBUG_OR_20%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_15_20140225Morning_DEBUG_OR_30%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_15_20140225Morning_DEBUG_OR_40%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_15_20140225Morning_DEBUG_OR_50%"

# weight == 10
# Active input: this is the only uncommented inputFileName2 assignment, so it
# is the file whose lines are counted into top10PostingPoppedDict below.
# NOTE(review): this path lives under results/ while the commented-out
# alternatives for the same weight point at polyIRIndexer/ -- confirm that
# this is the intended file.
inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/results/importantPostingsBeingPoppedAtDifferentLevels_weight_10_20140224Afternoon_DEBUG_OR_1%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_10_20140224Afternoon_DEBUG_OR_3%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_10_20140224Afternoon_DEBUG_OR_5%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_10_20140224Afternoon_DEBUG_OR_10%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_10_20140224Afternoon_DEBUG_OR_15%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_10_20140224Afternoon_DEBUG_OR_20%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_10_20140224Afternoon_DEBUG_OR_30%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_10_20140224Afternoon_DEBUG_OR_40%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_10_20140224Afternoon_DEBUG_OR_50%"

# weight == 5
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140224Afternoon_DEBUG_OR_1%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140224Afternoon_DEBUG_OR_3%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140224Afternoon_DEBUG_OR_5%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140224Afternoon_DEBUG_OR_10%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140224Afternoon_DEBUG_OR_15%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140224Afternoon_DEBUG_OR_20%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140224Afternoon_DEBUG_OR_30%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140224Afternoon_DEBUG_OR_40%"
# inputFileName2 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140224Afternoon_DEBUG_OR_50%"
                                                                                        

# weight 0 with OR semantics:
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140224Afternoon_DEBUG_1%_OR"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140224Afternoon_DEBUG_3%_OR"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140224Afternoon_DEBUG_5%_OR"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140224Afternoon_DEBUG_10%_OR"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140224Afternoon_DEBUG_15%_OR"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140224Afternoon_DEBUG_20%_OR"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140224Afternoon_DEBUG_30%_OR"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140224Afternoon_DEBUG_40%_OR"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140224Afternoon_DEBUG_50%_OR"

print "inputFileName2:",inputFileName2
inputFileHandler = open(inputFileName2,"r")
# ignore the headline
inputFileHandler.readline()
currentLine = inputFileHandler.readline()
while currentLine:
    top10PostingKey = currentLine.strip().split(" ")[0]
    if top10PostingKey not in top10PostingPoppedDict:
        top10PostingPoppedDict[top10PostingKey] = 1
    else:
        top10PostingPoppedDict[top10PostingKey] += 1
    currentLine = inputFileHandler.readline()
print "len(top10PostingPoppedDict): ",len(top10PostingPoppedDict)
inputFileHandler.close()

qidANDLowerThresholdDict = {}
# dodo
inputFileName0 = "/home/diaosi/web-search-engine-wei_MOVE_FROM_PANGOLIN_20131206/polyIRIndexer/tail5KResults_20140222Night_lowerBoundThresholds_sortedByQID"
# pangolin:
# inputFileName0 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/tail5KResults_20140222Night_lowerBoundThresholds_sortedByQID"
inputFileHanlder = open(inputFileName0,"r")
for line in inputFileHanlder.readlines():
    lineElements = line.strip().split(" ")
    qid = lineElements[0]
    lowerBound = float(lineElements[1])
    if qid not in qidANDLowerThresholdDict:
        qidANDLowerThresholdDict[qid] = lowerBound
print "len(qidANDLowerThresholdDict):",len(qidANDLowerThresholdDict)
inputFileHanlder.close()

postingScoreDict = {}
documentResultDict = {}
# dodo
inputFileName1 = "/home/diaosi/workspace/web-search-engine-wei-2014-March/results/tail5KResults_NEW_FORMAT_20140222Afternoon_OR"
# pangolin:
# inputFileName1 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/tail5KResults_NEW_FORMAT_20140222Afternoon_OR"
inputFileHandler = open(inputFileName1,"r")
currentLine = inputFileHandler.readline()
while currentLine:
    currentLineElements = currentLine.strip().split(" ")
    qIDInStringFormat = currentLineElements[0]
    docIDInStringFormat = currentLineElements[1]
    termIDInStringFormat = currentLineElements[2]
    postingScore = float( currentLineElements[3] )
    documentResultKey = qIDInStringFormat + "_" + docIDInStringFormat
    postingKey = termIDInStringFormat + "_" + docIDInStringFormat
    
    if postingKey not in postingScoreDict:
        postingScoreDict[postingKey] = postingScore
    
    if documentResultKey not in documentResultDict:
        documentResultDict[documentResultKey] = []
        documentResultDict[documentResultKey].append(termIDInStringFormat)
    else:
        documentResultDict[documentResultKey].append(termIDInStringFormat)
    currentLine = inputFileHandler.readline()
print "len(documentResultDict): ",len(documentResultDict)
print "len(postingScoreDict): ",len(postingScoreDict)
print "postingScoreDict['30128082_24914902']:",postingScoreDict['30128082_24914902']
inputFileHandler.close()

top10PostingCounted = 0
for documentResultKey in documentResultDict:
    qid = documentResultKey.strip().split("_")[0]
    docID = documentResultKey.strip().split("_")[1]
    accumulateScore = 0.0
    for termID in documentResultDict[documentResultKey]:
        top10PostingKey = termID + "_" + docID 
        if top10PostingKey in top10PostingPoppedDict:
            currentScore = postingScoreDict[top10PostingKey]
            if currentScore != 0.0:
                top10PostingCounted += 1
            else:
                print top10PostingKey,"has the score 0"
            accumulateScore += currentScore
    if accumulateScore >= qidANDLowerThresholdDict[qid]:
        numOfDocumentResultRetained += 1
print "numOfDocumentResultRetained: ",numOfDocumentResultRetained
print "top10PostingCounted: ",top10PostingCounted
exit(1)


